import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import scipy.stats as stats
import matplotlib
import math
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
data = pd.read_csv("C:/Users/didit/Downloads/DVE_Exam/datasets_2022/insurance.csv")
data.head()
| age | sex | bmi | children | smoker | region | charges | premium | |
|---|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 | 350 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 | 200 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 | 100 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 | 200 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 | 350 |
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1353 entries, 0 to 1352 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1353 non-null int64 1 sex 1353 non-null object 2 bmi 1353 non-null float64 3 children 1353 non-null int64 4 smoker 1338 non-null object 5 region 1353 non-null object 6 charges 1338 non-null float64 7 premium 1353 non-null int64 dtypes: float64(2), int64(3), object(3) memory usage: 84.7+ KB
Given the above output, we discover that the age, children and premium features are of int64 data type; sex, smoker and region are of object data type; and lastly bmi and charges are of float64 data type
data.nunique()
age 47 sex 2 bmi 563 children 6 smoker 2 region 4 charges 1337 premium 4 dtype: int64
the number of unique regions is 4
data.describe()
| age | bmi | children | charges | premium | |
|---|---|---|---|---|---|
| count | 1353.000000 | 1353.000000 | 1353.000000 | 1338.000000 | 1353.000000 |
| mean | 39.157428 | 30.670080 | 1.118256 | 13270.422265 | 287.730968 |
| std | 14.021578 | 6.093331 | 1.229290 | 12110.011237 | 152.363854 |
| min | 18.000000 | 15.960000 | 0.000000 | 1121.873900 | 100.000000 |
| 25% | 26.000000 | 26.290000 | 0.000000 | 4740.287150 | 100.000000 |
| 50% | 39.000000 | 30.400000 | 1.000000 | 9382.033000 | 350.000000 |
| 75% | 51.000000 | 34.700000 | 2.000000 | 16639.912515 | 500.000000 |
| max | 64.000000 | 53.130000 | 5.000000 | 63770.428010 | 500.000000 |
the median value for charges is 9382.03
data.describe(include=['object'])
| sex | smoker | region | |
|---|---|---|---|
| count | 1353 | 1338 | 1353 |
| unique | 2 | 2 | 4 |
| top | male | no | southeast |
| freq | 680 | 1064 | 369 |
data.shape
(1353, 8)
data.columns
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges',
'premium'],
dtype='object')
#data.isnull().sum()
data.dropna(inplace=True)
data.isnull().sum()
age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 premium 0 dtype: int64
# Count rows that are exact copies of an earlier row and report the result.
duplicated = data.duplicated().sum()
if not duplicated:
    print("the dataset does not contain any duplicates rows")
else:
    print("number of duplicated rows in the dataset: {}".format(duplicated))
the dataset does not contain any duplicates rows
data.dropna(inplace=True)
data
| age | sex | bmi | children | smoker | region | charges | premium | |
|---|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 | 350 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 | 200 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 | 100 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 | 200 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 | 350 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 | 350 |
| 1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 | 500 |
| 1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 | 200 |
| 1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 | 100 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 | 100 |
1338 rows × 8 columns
#data['smoker'] = data.smoker.replace({"yes":1, "no": 0})
#Age category
#data.loc[(data['age']>=18) & (data['age']<36), 'age_category']='youth'
#data.loc[(data['age']>=36) & (data['age']<=55), 'age_category']='Snr_youth'
#data.loc[data['age']>55, 'age_category']= 'adults'
#data['age_category'].value_counts()
#data.loc[data['children']>0, 'depedent'] = 'yes'
#data.loc[data['children']==0, 'dependent'] = 'no'
#data['dependent'].value_counts()
#setting the weight conditions
# Bucket BMI into four weight categories in a single pass.
# Bins are left-closed ([18.5, 25), [25, 30), [30, inf)), so a BMI of exactly
# 30 is classified as 'obese' instead of being left as NaN, and 'underweight'
# is written to the same 'weight_condition' column as the other labels rather
# than to a separately spelled column.
bmi_edges = [-np.inf, 18.5, 25, 30, np.inf]
bmi_labels = ['underweight', 'normal', 'overweight', 'obese']
# Cast back to object so select_dtypes('object') keeps treating the column as text.
data['weight_condition'] = pd.cut(
    data['bmi'], bins=bmi_edges, labels=bmi_labels, right=False
).astype(object)
data['weight_condition'].value_counts()
obese 705 overweight 386 normal 225 Name: weight_condition, dtype: int64
# Split the frame by dtype: numeric columns pass through unchanged, text
# columns are one-hot encoded with every level kept (drop_first=False).
num_data = data.select_dtypes(include=[np.number])
cat_data = data.select_dtypes(include='object')
dummy_data = pd.get_dummies(cat_data, drop_first=False)
# Dummy columns first, numeric columns last, aligned on the row index.
data_enc = pd.concat((dummy_data, num_data), axis='columns')
data_enc
| sex_female | sex_male | smoker_no | smoker_yes | region_northeast | region_northwest | region_southeast | region_southwest | age | bmi | children | charges | premium | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 19 | 27.900 | 0 | 16884.92400 | 350 |
| 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 18 | 33.770 | 1 | 1725.55230 | 200 |
| 2 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 28 | 33.000 | 3 | 4449.46200 | 100 |
| 3 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 33 | 22.705 | 0 | 21984.47061 | 200 |
| 4 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 32 | 28.880 | 0 | 3866.85520 | 350 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 50 | 30.970 | 3 | 10600.54830 | 350 |
| 1334 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 18 | 31.920 | 0 | 2205.98080 | 500 |
| 1335 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 18 | 36.850 | 0 | 1629.83350 | 200 |
| 1336 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 21 | 25.800 | 0 | 2007.94500 | 100 |
| 1337 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 61 | 29.070 | 0 | 29141.36030 | 100 |
1338 rows × 13 columns
data_enc.plot(kind='box', sharex ='charges', subplots=True, layout=(2,7),
figsize=(15,10), color='deeppink');
sns.boxplot(x='charges', data=data_enc)
<AxesSubplot:xlabel='charges'>
There are a lot of outliers towards the right, from approximately 35,000 up to about 65,000
ax = sns.distplot(x=data_enc.charges, rug=True, hist= True)
As you can see, the data is skewed to the right (positively skewed): most charges are low, with a long tail of high values
sns.histplot(data['charges'], kde=True)
plt.title('Distribution of charges', fontsize=12)
plt.show()
Given the above plots, we can see that most medical charges range between 1,000 and 12,000, few medical charges cost more than 30k, and we can say that we have outliers
# Pearson correlation matrix of the numeric columns only. The text columns
# (sex, smoker, region, ...) are excluded explicitly because DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0; selecting by dtype works on
# every pandas version.
corr = data.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(12,10))
sns.heatmap(corr, annot= True, cmap= 'CMRmap')
<AxesSubplot:>
data_corr_matrix = data_enc.corr()
plt.figure(figsize=(14,12))
sns.heatmap(data_corr_matrix, annot=True)
plt.show()
There is a correlation between charges and smoking, which is the strongest, followed by bmi and age
data['charges'].groupby(data['smoker']).mean()
smoker no 8434.268298 yes 32050.231832 Name: charges, dtype: float64
data.smoker.value_counts()
no 1064 yes 274 Name: smoker, dtype: int64
# Bar plot of charges per smoker status.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional slot is `data`).
plt.figure(figsize=(8,6))
sns.barplot(x=data.smoker, y=data.charges, hue=data.smoker, palette=['blue', 'red'], alpha=0.5)
plt.show()
smokers have more charges compared to non smokers
ax = sns.countplot(x='smoker', data = data)
ax.bar_label(ax.containers[0])
plt.title("no of smokers vs non smokers", fontsize=12)
Text(0.5, 1.0, 'no of smokers vs non smokers')
there are 1064 non smokers and 274 smokers
sns.histplot(data=data, x='charges', hue='smoker');
plt.title('distribution of charges by smoker status', fontsize=12)
plt.show()
f,axes = plt.subplots(1,2, figsize=(20,10))
sns.histplot(data.loc[data['smoker']=='no', 'charges'], ax= axes[0]).set_title('Non-smokers', fontsize=12);
sns.histplot(data.loc[data['smoker']=='yes', 'charges'], ax= axes[1]).set_title('smokers', fontsize=12);
plt.show()
from the above we can conclude that, non smokers pay less compared to smokers
data.sex.value_counts()
male 676 female 662 Name: sex, dtype: int64
import plotly.express as px
fig = px.histogram(data,
x='charges', marginal='box', color='sex',
color_discrete_sequence= ['blue', 'red'],
title='difference in charges over sex')
fig.update_layout(bargap=0.1)
fig.show()
# Bar plot of charges per sex, with the bar values printed on top.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
plt.figure(figsize=(12,10))
ax = sns.barplot(x=data.sex, y=data.charges, palette=['pink', 'blue'])
ax.bar_label(ax.containers[0])
plt.title('Charges over different genders')
plt.show()
given the above output, males are likely to pay more compared to females
ax = sns.countplot(x='sex', data=data)
ax.bar_label(ax.containers[0])
plt.title('No of Females vs Male')
plt.show()
sns.catplot(x='smoker', y='charges', hue='sex',
kind = 'violin', inner='quartiles', split=True,
palette='pastel', data=data)
<seaborn.axisgrid.FacetGrid at 0x2e01048e530>
data.groupby(['smoker','sex']).agg('count')['age']
smoker sex
no female 547
male 517
yes female 115
male 159
Name: age, dtype: int64
data.groupby(['smoker', 'sex']).agg('count')
| age | bmi | children | region | charges | premium | ||
|---|---|---|---|---|---|---|---|
| smoker | sex | ||||||
| no | female | 547 | 547 | 547 | 547 | 547 | 547 |
| male | 517 | 517 | 517 | 517 | 517 | 517 | |
| yes | female | 115 | 115 | 115 | 115 | 115 | 115 |
| male | 159 | 159 | 159 | 159 | 159 | 159 |
Observation made is that out of the total 1338 insured, 274 (20.5%) are smokers, and the rest are non smokers
sns.countplot(x='children', data=data_enc)
<AxesSubplot:xlabel='children', ylabel='count'>
# Histogram of charges, split by number of children, with a marginal box plot.
# The title now describes what is plotted (charges by children), not
# children over age.
fig = px.histogram(data,
                   x='charges', marginal='box', color='children',
                   color_discrete_sequence=['yellow', 'green'],
                   title='distribution of charges by number of children')
fig.update_layout(bargap=0.2)
fig.show()
data.groupby(['children']).agg('count')['charges']
children 0 574 1 324 2 240 3 157 4 25 5 18 Name: charges, dtype: int64
ax = sns.barplot(x='children', y='charges', data=data)
ax.bar_label(ax.containers[0])
plt.title('No of kids over charges', fontsize=12)
plt.show()
approximately 85% of the insured have less than 3 children
data_enc[['charges','children']].corr()
| charges | children | |
|---|---|---|
| charges | 1.000000 | 0.067998 |
| children | 0.067998 | 1.000000 |
The correlation value between charges and children is 0.068; this is an indication of a weak positive relationship
fig = px.histogram(data,
x = 'children', marginal = 'box', color = 'age',
color_discrete_sequence = ['yellow', 'green'],
title='different children over age')
fig.update_layout(bargap=0.2)
fig.show()
ax= sns.barplot(x='children', y='age', data=data)
ax.bar_label(ax.containers[0])
plt.title('relationshion between children vs age', fontsize=12)
plt.show()
data_enc[['children','age']].corr()
| children | age | |
|---|---|---|
| children | 1.000000 | 0.042469 |
| age | 0.042469 | 1.000000 |
All the combinations of age and number of children are weakly correlated. the correlation coefficient for age and children is 0.042
fig = px.histogram(data,
x = 'age', marginal = 'box',
color_discrete_sequence = ['red'],
title='AGE difference')
fig.update_layout(bargap=0.2)
fig.show()
ax = sns.distplot(x=data.age, rug=True, hist= True)
It's hard to tell, but from age 18-19 we have a total of 137 insured; the feature is non-uniform in distribution, although the mean (about 39.2) and the median (39) in the summary statistics above are very close to each other
sns.scatterplot(data=data, x='age', y='charges')
plt.title('insurance charges vs age', fontsize=12)
plt.show()
# Bar plot of charges per age, with the bar values printed on top.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
plt.figure(figsize=(12,10))
ax = sns.barplot(x=data.age, y=data.charges, palette=['pink', 'blue'])
ax.bar_label(ax.containers[0])
plt.title('Charges over different age')
plt.show()
# Two side-by-side scatter plots against age.
f, axes = plt.subplots(1,2, figsize=(12,8))
# Left panel: BMI against age.
sns.scatterplot(data=data, x='age', y='bmi', ax=axes[0]).set_title('Age vs BMI', fontsize=12)
# Right panel: charges against age, coloured by smoker status. The y variable
# now matches the panel title; it previously plotted 'bmi' a second time.
# NOTE(review): confirm that charges (not bmi) is the intended y axis here.
sns.scatterplot(data=data, x='age', y='charges', hue='smoker', ax=axes[1]).set_title('Age vs charges-smokers(variable)', fontsize=12)
plt.show()
According to insurancedekho "Well, health insurance premium increases with age. The younger you are, the lower your premium will be. Likewise, the older you are, the higher your health insurance premium will be. Since at a young age, the chances of encountering health conditions and visiting the doctor is low, the cost of health insurance premium is low."
Other factors might influence how charges change with age
fig = px.histogram(data,
x = 'age', marginal = 'box', color = 'smoker',
color_discrete_sequence = ['lightblue', 'red'],
title='different age over smoker')
fig.update_layout(bargap=0.2)
fig.show()
pearson_coef, p_value = stats.pearsonr(data_enc['smoker_yes'], data_enc['age'])
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
The Pearson Correlation Coefficient is -0.025018751536285088 with a P-value of P = 0.36048529037855864
# Scatter plot of charges against age, coloured by smoker status.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
plt.figure(figsize=(8,6))
sns.scatterplot(x=data.age, y=data.charges, hue=data.smoker, palette=['red','green'], alpha=0.6)
plt.show()
Given that the Pearson correlation coefficient is close to zero (-0.025) and the p-value is high (0.36), there is no statistically significant linear correlation between smoking and age
sns.distplot(data['bmi'])
plt.title('distribution of bmi',fontsize=12)
plt.show()
The plot shows an approximately normal (bell-shaped) distribution of values in the ‘bmi’ feature. Thus, the feature is well behaved, with mean and median values close to each other.
sns.lmplot(x="bmi", y="charges", data=data, height=(8), hue="sex", palette=sns.color_palette("Set2",10))
<seaborn.axisgrid.FacetGrid at 0x2e016b47970>
sns.lmplot(x="bmi", y="charges", data=data, height=(8), hue="smoker", palette=sns.color_palette("Set2",10))
<seaborn.axisgrid.FacetGrid at 0x2e01a7a2f20>
data['bmi'].value_counts()
32.300 13
28.310 9
30.495 8
30.875 8
31.350 8
..
46.200 1
23.800 1
44.770 1
32.120 1
30.970 1
Name: bmi, Length: 548, dtype: int64
data['bmi'].nunique()
548
# Count plot of the weight categories; each bar is labelled with its share
# of all rows, rounded to a whole percent.
total = len(data)
plt.figure(figsize=(12, 10))
plt.title('weight condition value counts')
ax = sns.countplot(data=data, x='weight_condition', hue='weight_condition')
for bar in ax.patches:
    bar_height = bar.get_height()
    share = bar_height / total * 100
    # Centre the label over the bar and lift it 5 count units above the top.
    label_xy = (bar.get_x() + bar.get_width() / 2, bar_height + 5)
    ax.annotate('{0:.0f}%'.format(share), label_xy, ha='center')
plt.show()
more than half of the population are considered obese
# Counts per sex, one panel per smoker status, bars split by weight category.
# x is passed by keyword: in seaborn >= 0.12 the first positional argument of
# catplot is `data`, so a positional 'sex' clashes with data=data.
sns.set_style('darkgrid')
sns.catplot(x='sex', col='smoker', hue='weight_condition', data=data, kind='count')
<seaborn.axisgrid.FacetGrid at 0x2e01bc6b8e0>
data['weight_condition'].value_counts()
obese 705 overweight 386 normal 225 Name: weight_condition, dtype: int64
# Average charge per weight category. Grouping by 'charges' as well created
# one group per distinct charge value, so nothing was actually averaged.
data.groupby('weight_condition')['charges'].mean()
| age | bmi | children | premium | ||
|---|---|---|---|---|---|
| weight_condition | charges | ||||
| normal | 1121.87390 | 18.0 | 23.21 | 0.0 | 500.0 |
| 1241.56500 | 19.0 | 19.80 | 0.0 | 200.0 | |
| 1242.26000 | 19.0 | 20.30 | 0.0 | 350.0 | |
| 1242.81600 | 19.0 | 20.70 | 0.0 | 350.0 | |
| 1515.34490 | 21.0 | 23.21 | 0.0 | 200.0 | |
| ... | ... | ... | ... | ... | ... |
| overweight | 32787.45859 | 42.0 | 28.31 | 3.0 | 350.0 |
| 35147.52848 | 24.0 | 28.50 | 0.0 | 100.0 | |
| 35160.13457 | 55.0 | 26.80 | 1.0 | 500.0 | |
| 37829.72420 | 43.0 | 27.80 | 0.0 | 200.0 | |
| 38245.59327 | 42.0 | 26.07 | 1.0 | 350.0 |
1315 rows × 4 columns
# Number of insured and total charges per weight category. Grouping by
# 'charges' as well produced one group per distinct charge value, and the
# stray 'count' argument to sum() was not an aggregation name.
# NOTE(review): confirm count + sum of charges is the intended summary.
data.groupby('weight_condition')['charges'].agg(['count', 'sum'])
| age | bmi | children | premium | ||
|---|---|---|---|---|---|
| weight_condition | charges | ||||
| normal | 1121.87390 | 18 | 23.21 | 0 | 500 |
| 1241.56500 | 19 | 19.80 | 0 | 200 | |
| 1242.26000 | 19 | 20.30 | 0 | 350 | |
| 1242.81600 | 19 | 20.70 | 0 | 350 | |
| 1515.34490 | 21 | 23.21 | 0 | 200 | |
| ... | ... | ... | ... | ... | ... |
| overweight | 32787.45859 | 42 | 28.31 | 3 | 350 |
| 35147.52848 | 24 | 28.50 | 0 | 100 | |
| 35160.13457 | 55 | 26.80 | 1 | 500 | |
| 37829.72420 | 43 | 27.80 | 0 | 200 | |
| 38245.59327 | 42 | 26.07 | 1 | 350 |
1315 rows × 4 columns
sns.scatterplot(data=data, x='bmi', y='charges', hue='weight_condition')
plt.title('charges by weight_condition', fontsize=12)
plt.show()
# Counts per region, one panel per smoker status, bars split by weight category.
# x is passed by keyword: in seaborn >= 0.12 the first positional argument of
# catplot is `data`, so a positional 'region' clashes with data=data.
sns.set_style('darkgrid')
sns.catplot(x='region', col='smoker', hue='weight_condition', data=data, kind='count')
<seaborn.axisgrid.FacetGrid at 0x2e01bd9d390>
fig = px.histogram(data,
x = 'charges', marginal = 'box', color = 'region',
color_discrete_sequence = ['lightblue', 'red','yellow','green'],
title='different charges over regions')
fig.update_layout(bargap=0.2)
fig.show()
Given the above plot, southeast part of U.S is leading in charges but majority of all customers from all parts of US are charged between 0-20k only
px.histogram(data, x='sex', y='charges', color='region')
The inference is that in every region of our customer base males incur more medical bills, except in the northwest region, where females have more medical bills
plt.figure(figsize=(14,12))
sns.heatmap(data_enc[['region_southwest','region_southeast',
'region_northwest','region_northeast',
'age', 'smoker_no','smoker_yes', 'charges']].corr(), annot=True)
plt.show()
pearson_coef, p_value = stats.pearsonr(data_enc['smoker_yes'], data_enc['charges'])
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
The Pearson Correlation Coefficient is 0.7872514304984773 with a P-value of P = 8.27143584217778e-283
pearson_coef, p_value = stats.pearsonr(data_enc['smoker_no'], data_enc['region_southwest'])
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
The Pearson Correlation Coefficient is 0.03694547401760744 with a P-value of P = 0.17681886419641932
pearson_coef, p_value = stats.pearsonr(data_enc['smoker_yes'], data_enc['region_northeast'])
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
The Pearson Correlation Coefficient is 0.002811134820450983 with a P-value of P = 0.9181759101730009
pearson_coef, p_value = stats.pearsonr(data_enc['region_southeast'], data_enc['charges'])
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
The Pearson Correlation Coefficient is 0.07398155156575978 with a P-value of P = 0.006782698910660704
pearson_coef, p_value = stats.pearsonr(data_enc['region_southeast'], data_enc['age'])
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
The Pearson Correlation Coefficient is -0.011641940617229146 with a P-value of P = 0.6704990026067668
pearson_coef, p_value = stats.pearsonr(data_enc['age'], data_enc['charges'])
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
The Pearson Correlation Coefficient is 0.29900819333064765 with a P-value of P = 4.8866933317203816e-29
South East, among all the regions, southeast has strongest relationship with charges, with Pearson correlation coefficient 0.074
##### What might impact this? Age and smoking are the most correlated with charges; unfortunately, age and smoking are also correlated with southeast
# Bar plot of charges per region, with the bar values printed on top.
# plt.figure is on its own line so it actually runs (it was fused onto the
# heading line above), x and y are passed by keyword as seaborn >= 0.12
# requires, and the title names regions rather than age.
plt.figure(figsize=(12,10))
ax = sns.barplot(x=data['region'], y=data['charges'], palette=['pink', 'blue'])
ax.bar_label(ax.containers[0])
plt.title('Charges over different regions')
plt.show()
given the above plot Southeast region has the highest average medical charge
# Restrict the analysis to smokers and rebuild the one-hot encoded frame for them.
data_smoker = data.loc[data['smoker'] == 'yes']
# Text columns of the smoker subset, without the 'weight_condition' label.
cat_smoker = data_smoker.select_dtypes(include='object')
cat_smoker = cat_smoker.drop(columns='weight_condition', errors='ignore')
# One-hot encode the remaining text columns, keeping every level.
dummy_smoker = pd.get_dummies(cat_smoker, drop_first=False)
# Numeric columns pass through unchanged.
num_smoker = data_smoker.select_dtypes(include=[np.number])
# Dummy columns first, numeric columns last.
smoker_enc = pd.concat((dummy_smoker, num_smoker), axis='columns')
plt.figure(figsize=(14,12))
sns.heatmap(smoker_enc.corr(), annot= True)
plt.show()
smoker_enc.corr()
| sex_female | sex_male | smoker_yes | region_northeast | region_northwest | region_southeast | region_southwest | weight_condtion_underweight | age | bmi | children | charges | premium | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sex_female | 1.000000 | -1.000000 | NaN | 0.015134 | 0.084307 | -0.034441 | -0.060521 | 0.105058 | 0.005758 | -0.148350 | -0.076907 | -0.101226 | -0.180217 |
| sex_male | -1.000000 | 1.000000 | NaN | -0.015134 | -0.084307 | 0.034441 | 0.060521 | -0.105058 | -0.005758 | 0.148350 | 0.076907 | 0.101226 | 0.180217 |
| smoker_yes | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| region_northeast | 0.015134 | -0.015134 | NaN | 1.000000 | -0.294808 | -0.401187 | -0.294808 | 0.112757 | -0.011290 | -0.193326 | -0.033675 | -0.117370 | 0.060714 |
| region_northwest | 0.084307 | -0.084307 | NaN | -0.294808 | 1.000000 | -0.365412 | -0.268519 | -0.003898 | 0.030313 | -0.128827 | 0.073008 | -0.083583 | 0.012495 |
| region_southeast | -0.034441 | 0.034441 | NaN | -0.401187 | -0.365412 | 1.000000 | -0.365412 | -0.096140 | 0.061987 | 0.267021 | -0.069078 | 0.171069 | 0.021382 |
| region_southwest | -0.060521 | 0.060521 | NaN | -0.294808 | -0.268519 | -0.365412 | 1.000000 | -0.003898 | -0.089901 | 0.024379 | 0.042066 | 0.009843 | -0.101027 |
| weight_condtion_underweight | 0.105058 | -0.105058 | NaN | 0.112757 | -0.003898 | -0.096140 | -0.003898 | 1.000000 | -0.099223 | -0.281171 | 0.175515 | -0.156690 | -0.061793 |
| age | 0.005758 | -0.005758 | NaN | -0.011290 | 0.030313 | 0.061987 | -0.089901 | -0.099223 | 1.000000 | 0.059674 | 0.081183 | 0.368224 | -0.001100 |
| bmi | -0.148350 | 0.148350 | NaN | -0.193326 | -0.128827 | 0.267021 | 0.024379 | -0.281171 | 0.059674 | 1.000000 | -0.012619 | 0.806481 | 0.063274 |
| children | -0.076907 | 0.076907 | NaN | -0.033675 | 0.073008 | -0.069078 | 0.042066 | 0.175515 | 0.081183 | -0.012619 | 1.000000 | 0.035945 | -0.016293 |
| charges | -0.101226 | 0.101226 | NaN | -0.117370 | -0.083583 | 0.171069 | 0.009843 | -0.156690 | 0.368224 | 0.806481 | 0.035945 | 1.000000 | 0.077804 |
| premium | -0.180217 | 0.180217 | NaN | 0.060714 | 0.012495 | 0.021382 | -0.101027 | -0.061793 | -0.001100 | 0.063274 | -0.016293 | 0.077804 | 1.000000 |
pearson_coef, p_value = stats.pearsonr(smoker_enc['bmi'], smoker_enc['charges'])
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
The Pearson Correlation Coefficient is 0.8064806070155406 with a P-value of P = 5.019668631794899e-64
f, axes = plt.subplots(2,2, figsize=(20,15))
sns.histplot(data=data_smoker,
x= data_smoker['charges'], hue='region',
ax=axes[0,0]).set_title('Distribution of chargers for smokers by region',fontsize=12);
sns.histplot(data= data_smoker,
x = data_smoker['charges'], hue='weight_condition',
ax=axes[0,1]).set_title('Distribution of charges for smokers by weight_condition',fontsize=12)
sns.scatterplot(data= data_smoker, x='age',
y='charges', hue='region', ax=axes[1,0]).set_title('charges vs age for smokers by region', fontsize=12)
sns.scatterplot(data=data_smoker,
x='age', y='charges', hue='weight_condition', ax=axes[1,1]).set_title('age vs charges by weight condition',fontsize=12)
plt.show()
data_smoker.groupby(['weight_condition','region'])['charges'].mean().reset_index(name='average_charge').sort_values('average_charge',
ascending=False)
| weight_condition | region | average_charge | |
|---|---|---|---|
| 5 | obese | northwest | 42425.281095 |
| 6 | obese | southeast | 42256.051773 |
| 4 | obese | northeast | 41275.186048 |
| 7 | obese | southwest | 40592.695041 |
| 8 | overweight | northeast | 23355.678233 |
| 9 | overweight | northwest | 22667.486339 |
| 10 | overweight | southeast | 22292.695598 |
| 11 | overweight | southwest | 21630.652138 |
| 2 | normal | southeast | 21177.320841 |
| 1 | normal | northwest | 20115.304300 |
| 0 | normal | northeast | 19745.474630 |
| 3 | normal | southwest | 17366.054172 |
from the given output, we can conclude that northwest region shows an indication for higher average charges for clients(obese and greater)
#loading activity labels
activity = pd.read_table('C:/Users/didit/Downloads/DVE_Exam/datasets_2022/UCI HAR Dataset/activity_labels.txt', sep=' ', header=None,
names=('ID','Activity'))
activity
| ID | Activity | |
|---|---|---|
| 0 | 1 | WALKING |
| 1 | 2 | WALKING_UPSTAIRS |
| 2 | 3 | WALKING_DOWNSTAIRS |
| 3 | 4 | SITTING |
| 4 | 5 | STANDING |
| 5 | 6 | LAYING |
#loading the features
feat = pd.read_table('C:/Users/didit/Downloads/DVE_Exam/datasets_2022/UCI HAR Dataset/features.txt', sep=' ', header=None,
names=('ID','Sensors'))
feat.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 561 entries, 0 to 560 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 561 non-null int64 1 Sensors 561 non-null object dtypes: int64(1), object(1) memory usage: 8.9+ KB
feat.shape
(561, 2)
feat.head()
| ID | Sensors | |
|---|---|---|
| 0 | 1 | tBodyAcc-mean()-X |
| 1 | 2 | tBodyAcc-mean()-Y |
| 2 | 3 | tBodyAcc-mean()-Z |
| 3 | 4 | tBodyAcc-std()-X |
| 4 | 5 | tBodyAcc-std()-Y |
# Load the HAR training split: sensor readings, subject ids and activity labels.
# The separator is a raw string because '\s' in a plain literal is an invalid
# escape sequence (a warning today, an error in a future Python).
xtrain_df = pd.read_table('C:/Users/didit/Downloads/DVE_Exam/datasets_2022/UCI HAR Dataset/train/X_train.txt', sep=r'\s+', header=None)
#add volunteer column to the dataframe
subtrain = pd.read_table('C:/Users/didit/Downloads/DVE_Exam/datasets_2022/UCI HAR Dataset/train/subject_train.txt', header=None, names=['SubjectID'])
ytrain_df = pd.read_table('C:/Users/didit/Downloads/DVE_Exam/datasets_2022/UCI HAR Dataset/train/y_train.txt', sep=' ', header=None, names=['ActivityID'])
# Name the feature columns after the sensor list loaded into `feat`.
Har_train_sensor = feat['Sensors']
xtrain_df.columns = Har_train_sensor
dat_har = pd.concat([xtrain_df,subtrain], axis=1)
# Translate numeric activity ids to their names with a single mapping instead
# of one DataFrame.replace call per activity id.
activity_names = dict(zip(activity['ID'], activity['Activity']))
ytrain_df = ytrain_df.replace(activity_names)
ytrain_df.columns = ['Activity']
dat_har = pd.concat([dat_har,ytrain_df], axis=1)
print('X-train shape', xtrain_df.shape)
print('ysubtrain shape',subtrain.shape,'ytrain shape', ytrain_df.shape)
X-train shape (7352, 561) ysubtrain shape (7352, 1) ytrain shape (7352, 1)
dat_har.head()
| tBodyAcc-mean()-X | tBodyAcc-mean()-Y | tBodyAcc-mean()-Z | tBodyAcc-std()-X | tBodyAcc-std()-Y | tBodyAcc-std()-Z | tBodyAcc-mad()-X | tBodyAcc-mad()-Y | tBodyAcc-mad()-Z | tBodyAcc-max()-X | ... | fBodyBodyGyroJerkMag-kurtosis() | angle(tBodyAccMean,gravity) | angle(tBodyAccJerkMean),gravityMean) | angle(tBodyGyroMean,gravityMean) | angle(tBodyGyroJerkMean,gravityMean) | angle(X,gravityMean) | angle(Y,gravityMean) | angle(Z,gravityMean) | SubjectID | Activity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.288585 | -0.020294 | -0.132905 | -0.995279 | -0.983111 | -0.913526 | -0.995112 | -0.983185 | -0.923527 | -0.934724 | ... | -0.710304 | -0.112754 | 0.030400 | -0.464761 | -0.018446 | -0.841247 | 0.179941 | -0.058627 | 1 | STANDING |
| 1 | 0.278419 | -0.016411 | -0.123520 | -0.998245 | -0.975300 | -0.960322 | -0.998807 | -0.974914 | -0.957686 | -0.943068 | ... | -0.861499 | 0.053477 | -0.007435 | -0.732626 | 0.703511 | -0.844788 | 0.180289 | -0.054317 | 1 | STANDING |
| 2 | 0.279653 | -0.019467 | -0.113462 | -0.995380 | -0.967187 | -0.978944 | -0.996520 | -0.963668 | -0.977469 | -0.938692 | ... | -0.760104 | -0.118559 | 0.177899 | 0.100699 | 0.808529 | -0.848933 | 0.180637 | -0.049118 | 1 | STANDING |
| 3 | 0.279174 | -0.026201 | -0.123283 | -0.996091 | -0.983403 | -0.990675 | -0.997099 | -0.982750 | -0.989302 | -0.938692 | ... | -0.482845 | -0.036788 | -0.012892 | 0.640011 | -0.485366 | -0.848649 | 0.181935 | -0.047663 | 1 | STANDING |
| 4 | 0.276629 | -0.016570 | -0.115362 | -0.998139 | -0.980817 | -0.990482 | -0.998321 | -0.979672 | -0.990441 | -0.942469 | ... | -0.699205 | 0.123320 | 0.122542 | 0.693578 | -0.615971 | -0.847865 | 0.185151 | -0.043892 | 1 | STANDING |
5 rows × 563 columns
dat_har.shape
(7352, 563)
# check duplicates
# Count rows that are exact copies of an earlier row and report the result.
duplicated = dat_har.duplicated().sum()
if not duplicated:
    print("the dataset does not contain any duplicates rows")
else:
    print("number of duplicated rows in the dataset: {}".format(duplicated))
the dataset does not contain any duplicates rows
dat_har.describe()
| tBodyAcc-mean()-X | tBodyAcc-mean()-Y | tBodyAcc-mean()-Z | tBodyAcc-std()-X | tBodyAcc-std()-Y | tBodyAcc-std()-Z | tBodyAcc-mad()-X | tBodyAcc-mad()-Y | tBodyAcc-mad()-Z | tBodyAcc-max()-X | ... | fBodyBodyGyroJerkMag-skewness() | fBodyBodyGyroJerkMag-kurtosis() | angle(tBodyAccMean,gravity) | angle(tBodyAccJerkMean),gravityMean) | angle(tBodyGyroMean,gravityMean) | angle(tBodyGyroJerkMean,gravityMean) | angle(X,gravityMean) | angle(Y,gravityMean) | angle(Z,gravityMean) | SubjectID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | ... | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 | 7352.000000 |
| mean | 0.274488 | -0.017695 | -0.109141 | -0.605438 | -0.510938 | -0.604754 | -0.630512 | -0.526907 | -0.606150 | -0.468604 | ... | -0.307009 | -0.625294 | 0.008684 | 0.002186 | 0.008726 | -0.005981 | -0.489547 | 0.058593 | -0.056515 | 17.413085 |
| std | 0.070261 | 0.040811 | 0.056635 | 0.448734 | 0.502645 | 0.418687 | 0.424073 | 0.485942 | 0.414122 | 0.544547 | ... | 0.321011 | 0.307584 | 0.336787 | 0.448306 | 0.608303 | 0.477975 | 0.511807 | 0.297480 | 0.279122 | 8.975143 |
| min | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -0.999873 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | ... | -0.995357 | -0.999765 | -0.976580 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 1.000000 |
| 25% | 0.262975 | -0.024863 | -0.120993 | -0.992754 | -0.978129 | -0.980233 | -0.993591 | -0.978162 | -0.980251 | -0.936219 | ... | -0.542602 | -0.845573 | -0.121527 | -0.289549 | -0.482273 | -0.376341 | -0.812065 | -0.017885 | -0.143414 | 8.000000 |
| 50% | 0.277193 | -0.017219 | -0.108676 | -0.946196 | -0.851897 | -0.859365 | -0.950709 | -0.857328 | -0.857143 | -0.881637 | ... | -0.343685 | -0.711692 | 0.009509 | 0.008943 | 0.008735 | -0.000368 | -0.709417 | 0.182071 | 0.003181 | 19.000000 |
| 75% | 0.288461 | -0.010783 | -0.097794 | -0.242813 | -0.034231 | -0.262415 | -0.292680 | -0.066701 | -0.265671 | -0.017129 | ... | -0.126979 | -0.503878 | 0.150865 | 0.292861 | 0.506187 | 0.359368 | -0.509079 | 0.248353 | 0.107659 | 26.000000 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.916238 | 1.000000 | 1.000000 | 0.967664 | 1.000000 | 1.000000 | ... | 0.989538 | 0.956845 | 1.000000 | 1.000000 | 0.998702 | 0.996078 | 1.000000 | 0.478157 | 1.000000 | 30.000000 |
8 rows × 562 columns
print('number of unique classes:',dat_har['Activity'].nunique())
print('unique classes:', dat_har['Activity'].unique())
number of unique classes: 6 unique classes: ['STANDING' 'SITTING' 'LAYING' 'WALKING' 'WALKING_DOWNSTAIRS' 'WALKING_UPSTAIRS']
# Number of rows per activity class, with the counts printed on the bars.
# The column is passed as x='Activity': in seaborn >= 0.12 the first
# positional argument of countplot is `data`, which clashed with data=dat_har.
plt.figure(figsize=(12,10))
ax = sns.countplot(x='Activity', data=dat_har)
ax.bar_label(ax.containers[0])
plt.title('No of entries per class', fontsize=12)
plt.xticks(rotation=45);
plt.show()
# Number of rows per subject, with the counts printed on the bars.
# The column is passed as x='SubjectID': in seaborn >= 0.12 the first
# positional argument of countplot is `data`, which clashed with data=dat_har.
plt.figure(figsize=(12,10))
ax = sns.countplot(x='SubjectID', data=dat_har)
ax.bar_label(ax.containers[0])
plt.title('No of entries per user', fontsize=12)
plt.xticks(rotation=0);
plt.show()
plt.figure(figsize=(12,8))
plt.title('No of entries per user by class', fontsize=12)
sns.countplot(x='SubjectID', hue='Activity', data= dat_har )
plt.show()
number of entries per user by class are almost balanced for all users excluding Id 1, 19 and 20
dat_har.columns[:10]
Index(['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z',
'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z',
'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z',
'tBodyAcc-max()-X'],
dtype='object')
activity.nunique()
sns.boxplot(x='SubjectID', y='angle(Y,gravityMean)', data= dat_har)
plt.show()
def plot_distribution(dataset, cols=5, width=20, height=15, hspace=0.2, wspace=0.5):
    """Draw one subplot per column of *dataset* on a single figure.

    Object-dtype columns get a horizontal count plot whose tick labels are
    cut to 18 characters; every other column gets a seaborn distribution plot.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are plotted, in column order.
    cols : int
        Number of subplots per row; the number of rows is derived from it.
    width, height : float
        Figure size passed to ``plt.figure(figsize=...)``.
    hspace, wspace : float
        Vertical / horizontal spacing passed to ``fig.subplots_adjust``.
    """
    # Newer matplotlib ships the seaborn styles as 'seaborn-v0_8-*' and no
    # longer lists the old names; use whichever name this install provides.
    style_name = 'seaborn-whitegrid'
    if style_name not in plt.style.available:
        style_name = 'seaborn-v0_8-whitegrid'
    plt.style.use(style_name)

    fig = plt.figure(figsize=(width, height))
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None,
                        wspace=wspace, hspace=hspace)
    rows = math.ceil(dataset.shape[1] / cols)
    for i, column in enumerate(dataset.columns):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(column)
        # np.object was removed in NumPy 1.24; the builtin `object` compares
        # equal to the same dtype on every NumPy version.
        if dataset.dtypes[column] == object:
            g = sns.countplot(y=column, data=dataset, ax=ax)
            # Cut long category names so they do not overlap the neighbouring plot.
            substrings = [s.get_text()[:18] for s in g.get_yticklabels()]
            g.set(yticklabels=substrings)
        else:
            g = sns.distplot(dataset[column], ax=ax)
        # Applies to the subplot just created, which is the current axes.
        plt.xticks(rotation=25)
cols_to_plot = ['angle(X,gravityMean)','tBodyAcc-mean()-X']
plot_distribution(dat_har[cols_to_plot], cols=3, width=20, height=20, hspace=0.45, wspace=0.5)
# Load the HAR test split: sensor readings, subject ids and activity labels.
# The separator is a raw string because '\s' in a plain literal is an invalid
# escape sequence (a warning today, an error in a future Python).
xtest_df = pd.read_table('C:/Users/didit/Downloads/DVE_Exam/datasets_2022/UCI HAR Dataset/test/X_test.txt', sep=r'\s+', header=None)
#add volunteer column to the dataframe
sub_test = pd.read_table('C:/Users/didit/Downloads/DVE_Exam/datasets_2022/UCI HAR Dataset/test/subject_test.txt', header=None, names=['SubjectID'])
ytest_df = pd.read_table('C:/Users/didit/Downloads/DVE_Exam/datasets_2022/UCI HAR Dataset/test/y_test.txt', sep=' ', header=None, names=['ActivityID'])
# Name the feature columns after the sensor list loaded into `feat`.
Har_test_sensor = feat['Sensors']
xtest_df.columns = Har_test_sensor
test_har = pd.concat([xtest_df,sub_test], axis=1)
# Translate numeric activity ids to their names with a single mapping instead
# of one DataFrame.replace call per activity id.
activity_names = dict(zip(activity['ID'], activity['Activity']))
ytest_df = ytest_df.replace(activity_names)
ytest_df.columns = ['Activity']
test_har = pd.concat([test_har,ytest_df], axis=1)
test_har.shape